import pandas as pd
import numpy as np
import plotly
import networkx
import matplotlib.pyplot as plt
# importing module
import pandas as pd
# dataset
# Load the market-basket dataset: one row per transaction, one cell per item
# slot (empty slots read as NaN).
data = pd.read_csv("C:\\Users\\rajat.k.srivastava\\Downloads\\Market_Basket_Optimisation.csv")
# Inspect the dimensions: (number of transactions, maximum basket size).
data.shape
(7500, 20)
# Preview the first five transactions.
data.head()
| shrimp | almonds | avocado | vegetables mix | green grapes | whole weat flour | yams | cottage cheese | energy drink | tomato juice | low fat yogurt | green tea | honey | salad | mineral water | salmon | antioxydant juice | frozen smoothie | spinach | olive oil | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | burgers | meatballs | eggs | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | chutney | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | turkey | avocado | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | mineral water | milk | energy bar | whole wheat rice | green tea | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | low fat yogurt | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Top 10 items
# importing module
import numpy as np
# Gather all items of every transaction into one flat 1-D NumPy array.
# astype(str) mirrors the string coercion np.array() performs on a mixed
# str/float list, so missing cells become the literal string 'nan'
# (filtered out downstream). flatten() walks row-major, matching a
# row-then-column double loop.
transaction = data.values.astype(str).flatten()
transaction
array(['burgers', 'meatballs', 'eggs', ..., 'nan', 'nan', 'nan'],
dtype='<U32')
# Wrap the flat item array in a single-column DataFrame so it can be
# aggregated with group-by.
df = pd.DataFrame(transaction, columns=["items"])
df.head()
| items | |
|---|---|
| 0 | burgers |
| 1 | meatballs |
| 2 | eggs |
| 3 | nan |
| 4 | nan |
# Tag every occurrence with a count of 1 so a group-by sum yields item
# frequencies.
df["incident_count"] = 1
# Remove the placeholder 'nan' entries that came from empty basket cells.
nan_rows = df[df["items"] == "nan"].index
df.drop(nan_rows, inplace=True)
# Build a frequency table: total occurrences per item, most frequent first.
df_table = (
    df.groupby("items")
    .sum()
    .sort_values("incident_count", ascending=False)
    .reset_index()
)
# Visualize the ten best-selling items with a green colour gradient.
df_table.head(10).style.background_gradient(cmap='Greens')
| items | incident_count | |
|---|---|---|
| 0 | mineral water | 1787 |
| 1 | eggs | 1348 |
| 2 | spaghetti | 1306 |
| 3 | french fries | 1282 |
| 4 | chocolate | 1230 |
| 5 | green tea | 990 |
| 6 | milk | 972 |
| 7 | ground beef | 737 |
| 8 | frozen vegetables | 715 |
| 9 | pancakes | 713 |
# importing required module
import plotly.express as px
# Add a constant column so every item hangs off a single shared root node
# in the treemap hierarchy.
df_table["all"] = "all"
# Build a treemap of the 30 most frequent items, shaded by frequency.
top30 = df_table.head(30)
fig = px.treemap(
    top30,
    path=['all', "items"],
    values='incident_count',
    color=top30["incident_count"],
    hover_data=['items'],
    color_continuous_scale='Greens',
)
# Render the treemap.
fig.show()
C:\Users\rajat.k.srivastava\Anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) C:\Users\rajat.k.srivastava\Anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
# Rebuild the data as one list of item strings per transaction so it can be
# one-hot encoded. NOTE(review): missing cells become the literal item 'nan'
# here and thus get their own encoded column; it is only excluded later by
# the top-50 selection — confirm that is intended.
transaction = []
for row in data.values:
    transaction.append([str(cell) for cell in row])
# creating the numpy array of the transactions
transaction = np.array(transaction)
# importing the required module
from mlxtend.preprocessing import TransactionEncoder
# One-hot encode: one boolean column per distinct item, one row per basket.
encoder = TransactionEncoder()
encoded = encoder.fit(transaction).transform(transaction)
dataset = pd.DataFrame(encoded, columns=encoder.columns_)
# dataset after encoding
dataset.head()
| asparagus | almonds | antioxydant juice | asparagus | avocado | babies food | bacon | barbecue sauce | black tea | blueberries | ... | turkey | vegetables mix | water spray | white wine | whole weat flour | whole wheat pasta | whole wheat rice | yams | yogurt cake | zucchini | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | True | False | False | False | False | False | ... | True | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | True | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 121 columns
# Restrict the encoded table to the 50 most frequent items to keep the
# rule-mining search space small.
first50 = df_table["items"].head(50).values
dataset = dataset.loc[:, first50]
# preview of the reduced dataset
dataset.head()
| mineral water | eggs | spaghetti | french fries | chocolate | green tea | milk | ground beef | frozen vegetables | pancakes | ... | ham | energy bar | energy drink | pepper | cereals | vegetables mix | muffins | oil | french wine | fresh tuna | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | True | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | True | False | False | False | False | True | True | False | False | False | ... | False | True | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 50 columns
# importing the required module
from mlxtend.frequent_patterns import apriori, association_rules
# Extract the most frequent itemsets via mlxtend: keep itemsets present in
# at least 1% of baskets. The length column eases filtering by itemset size.
frequent_itemsets = apriori(dataset, min_support=0.01, use_colnames=True)
# apply(len) instead of a lambda wrapper — identical result, clearer intent.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
# printing the frequent itemsets
frequent_itemsets
| support | itemsets | length | |
|---|---|---|---|
| 0 | 0.238267 | (mineral water) | 1 |
| 1 | 0.179733 | (eggs) | 1 |
| 2 | 0.174133 | (spaghetti) | 1 |
| 3 | 0.170933 | (french fries) | 1 |
| 4 | 0.163867 | (chocolate) | 1 |
| ... | ... | ... | ... |
| 229 | 0.010933 | (ground beef, mineral water, chocolate) | 3 |
| 230 | 0.011067 | (ground beef, mineral water, milk) | 3 |
| 231 | 0.011067 | (mineral water, milk, frozen vegetables) | 3 |
| 232 | 0.010533 | (eggs, chocolate, spaghetti) | 3 |
| 233 | 0.010933 | (spaghetti, chocolate, milk) | 3 |
234 rows × 3 columns
# Frequent item pairs (length 2) with support of at least 5%.
pair_mask = (frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.05)
frequent_itemsets[pair_mask]
| support | itemsets | length | |
|---|---|---|---|
| 50 | 0.050933 | (eggs, mineral water) | 2 |
| 51 | 0.059733 | (spaghetti, mineral water) | 2 |
| 53 | 0.052667 | (mineral water, chocolate) | 2 |
# First three frequent itemsets of size 3.
frequent_itemsets[frequent_itemsets['length'] == 3].head(3)
| support | itemsets | length | |
|---|---|---|---|
| 217 | 0.014267 | (eggs, mineral water, spaghetti) | 3 |
| 218 | 0.013467 | (eggs, mineral water, chocolate) | 3 |
| 219 | 0.013067 | (eggs, mineral water, milk) | 3 |
# Use "lift" as the metric to decide whether antecedents & consequents are
# dependent or not; keep only rules with lift >= 1.2.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
# apply(len) instead of a lambda wrapper — identical result, clearer intent.
rules["antecedents_length"] = rules["antecedents"].apply(len)
rules["consequents_length"] = rules["consequents"].apply(len)
# Strongest dependencies first.
rules.sort_values("lift", ascending=False)
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | antecedents_length | consequents_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 218 | (ground beef) | (herb & pepper) | 0.098267 | 0.049467 | 0.016000 | 0.162822 | 3.291555 | 0.011139 | 1.135402 | 1 | 1 |
| 219 | (herb & pepper) | (ground beef) | 0.049467 | 0.098267 | 0.016000 | 0.323450 | 3.291555 | 0.011139 | 1.332841 | 1 | 1 |
| 295 | (ground beef) | (spaghetti, mineral water) | 0.098267 | 0.059733 | 0.017067 | 0.173677 | 2.907540 | 0.011197 | 1.137893 | 1 | 2 |
| 290 | (spaghetti, mineral water) | (ground beef) | 0.059733 | 0.098267 | 0.017067 | 0.285714 | 2.907540 | 0.011197 | 1.262427 | 2 | 1 |
| 312 | (olive oil) | (spaghetti, mineral water) | 0.065733 | 0.059733 | 0.010267 | 0.156187 | 2.614731 | 0.006340 | 1.114306 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 60 | (eggs) | (low fat yogurt) | 0.179733 | 0.076400 | 0.016800 | 0.093472 | 1.223453 | 0.003068 | 1.018832 | 1 | 1 |
| 122 | (escalope) | (french fries) | 0.079333 | 0.170933 | 0.016400 | 0.206723 | 1.209376 | 0.002839 | 1.045116 | 1 | 1 |
| 123 | (french fries) | (escalope) | 0.170933 | 0.079333 | 0.016400 | 0.095944 | 1.209376 | 0.002839 | 1.018373 | 1 | 1 |
| 164 | (shrimp) | (green tea) | 0.071333 | 0.132000 | 0.011333 | 0.158879 | 1.203625 | 0.001917 | 1.031956 | 1 | 1 |
| 165 | (green tea) | (shrimp) | 0.132000 | 0.071333 | 0.011333 | 0.085859 | 1.203625 | 0.001917 | 1.015890 | 1 | 1 |
350 rows × 11 columns
# Rank the same rules by confidence rather than lift.
rules.sort_values(by="confidence", ascending=False)
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | antecedents_length | consequents_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 269 | (eggs, ground beef) | (mineral water) | 0.020000 | 0.238267 | 0.010133 | 0.506667 | 2.126469 | 0.005368 | 1.544054 | 2 | 1 |
| 327 | (ground beef, milk) | (mineral water) | 0.022000 | 0.238267 | 0.011067 | 0.503030 | 2.111207 | 0.005825 | 1.532756 | 2 | 1 |
| 321 | (ground beef, chocolate) | (mineral water) | 0.023067 | 0.238267 | 0.010933 | 0.473988 | 1.989319 | 0.005437 | 1.448130 | 2 | 1 |
| 334 | (milk, frozen vegetables) | (mineral water) | 0.023600 | 0.238267 | 0.011067 | 0.468927 | 1.968075 | 0.005444 | 1.434328 | 2 | 1 |
| 34 | (soup) | (mineral water) | 0.050533 | 0.238267 | 0.023067 | 0.456464 | 1.915771 | 0.011026 | 1.401441 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46 | (mineral water) | (red wine) | 0.238267 | 0.028133 | 0.010933 | 0.045887 | 1.631053 | 0.004230 | 1.018607 | 1 | 1 |
| 313 | (mineral water) | (spaghetti, olive oil) | 0.238267 | 0.022933 | 0.010267 | 0.043089 | 1.878880 | 0.004802 | 1.021063 | 1 | 2 |
| 49 | (mineral water) | (cereals) | 0.238267 | 0.025733 | 0.010267 | 0.043089 | 1.674442 | 0.004135 | 1.018137 | 1 | 1 |
| 272 | (mineral water) | (eggs, ground beef) | 0.238267 | 0.020000 | 0.010133 | 0.042529 | 2.126469 | 0.005368 | 1.023530 | 1 | 2 |
| 277 | (mineral water) | (spaghetti, french fries) | 0.238267 | 0.027600 | 0.010133 | 0.042529 | 1.540920 | 0.003557 | 1.015593 | 1 | 2 |
350 rows × 11 columns